{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright (C) 2016 - 2019 Pinard Liu(liujianping-ok@163.com)\n",
    "\n",
    "https://www.cnblogs.com/pinard\n",
    "\n",
    "Permission given to modify the code as long as you keep this declaration at the top\n",
    "\n",
    "用Spark学习FP Tree算法和PrefixSpan算法 https://www.cnblogs.com/pinard/p/6340162.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# The directories below are machine-specific: change them to the Spark, Python\n",
    "# and Java installation directories on your own machine.\n",
    "SPARK_HOME = \"C:/Tools/spark-2.2.0-bin-hadoop2.6/\"\n",
    "os.environ['SPARK_HOME'] = SPARK_HOME\n",
    "os.environ['PYSPARK_PYTHON'] = \"C:/Users/tata/AppData/Local/Programs/Python/Python36/python.exe\"\n",
    "\n",
    "# Entries appended to sys.path so that the pyspark and py4j packages bundled\n",
    "# with the Spark distribution can be imported below.\n",
    "# NOTE(review): the JDK directory appears to have no effect on sys.path (it is\n",
    "# not a Python package location); setting JAVA_HOME was presumably intended - confirm.\n",
    "EXTRA_SYS_PATHS = [\n",
    "    SPARK_HOME + \"bin\",\n",
    "    SPARK_HOME + \"python\",\n",
    "    SPARK_HOME + \"python/pyspark\",\n",
    "    SPARK_HOME + \"python/lib\",\n",
    "    SPARK_HOME + \"python/lib/pyspark.zip\",\n",
    "    SPARK_HOME + \"python/lib/py4j-0.10.4-src.zip\",\n",
    "    \"C:/Program Files/Java/jdk1.8.0_171\",\n",
    "]\n",
    "for extra_path in EXTRA_SYS_PATHS:\n",
    "    # Skip entries that are already present so re-running this cell does not\n",
    "    # keep growing sys.path.\n",
    "    if extra_path not in sys.path:\n",
    "        sys.path.append(extra_path)\n",
    "\n",
    "# These imports must come after the sys.path changes above.\n",
    "from pyspark import SparkContext\n",
    "from pyspark import SparkConf\n",
    "\n",
    "# getOrCreate() reuses an already running context, so re-executing this cell\n",
    "# does not fail with 'Cannot run multiple SparkContexts at once'.\n",
    "conf = SparkConf().setMaster(\"local\").setAppName(\"testing\")\n",
    "sc = SparkContext.getOrCreate(conf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<SparkContext master=local appName=testing>\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show the master and application name of the active SparkContext.\n",
    "print(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "from pyspark.mllib.fpm import FPGrowth\n",
    "\n",
    "# Ten toy transactions; each inner list holds the items of one transaction.\n",
    "data = [\n",
    "    [\"A\", \"B\", \"C\", \"E\", \"F\", \"O\"],\n",
    "    [\"A\", \"C\", \"G\"],\n",
    "    [\"E\", \"I\"],\n",
    "    [\"A\", \"C\", \"D\", \"E\", \"G\"],\n",
    "    [\"A\", \"C\", \"E\", \"G\", \"L\"],\n",
    "    [\"E\", \"J\"],\n",
    "    [\"A\", \"B\", \"C\", \"E\", \"F\", \"P\"],\n",
    "    [\"A\", \"C\", \"D\"],\n",
    "    [\"A\", \"C\", \"E\", \"G\", \"M\"],\n",
    "    [\"A\", \"C\", \"E\", \"G\", \"N\"],\n",
    "]\n",
    "rdd = sc.parallelize(data, numSlices=2)\n",
    "# Support threshold of 20%.\n",
    "model = FPGrowth.train(rdd, minSupport=0.2, numPartitions=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[FreqItemset(items=['A'], freq=8),\n",
       " FreqItemset(items=['B'], freq=2),\n",
       " FreqItemset(items=['B', 'A'], freq=2),\n",
       " FreqItemset(items=['B', 'C'], freq=2),\n",
       " FreqItemset(items=['B', 'C', 'A'], freq=2),\n",
       " FreqItemset(items=['B', 'E'], freq=2),\n",
       " FreqItemset(items=['B', 'E', 'A'], freq=2),\n",
       " FreqItemset(items=['B', 'E', 'C'], freq=2),\n",
       " FreqItemset(items=['B', 'E', 'C', 'A'], freq=2),\n",
       " FreqItemset(items=['C'], freq=8),\n",
       " FreqItemset(items=['C', 'A'], freq=8),\n",
       " FreqItemset(items=['D'], freq=2),\n",
       " FreqItemset(items=['D', 'A'], freq=2),\n",
       " FreqItemset(items=['D', 'C'], freq=2),\n",
       " FreqItemset(items=['D', 'C', 'A'], freq=2),\n",
       " FreqItemset(items=['E'], freq=8),\n",
       " FreqItemset(items=['E', 'A'], freq=6),\n",
       " FreqItemset(items=['E', 'C'], freq=6),\n",
       " FreqItemset(items=['E', 'C', 'A'], freq=6),\n",
       " FreqItemset(items=['F'], freq=2),\n",
       " FreqItemset(items=['F', 'A'], freq=2),\n",
       " FreqItemset(items=['F', 'B'], freq=2),\n",
       " FreqItemset(items=['F', 'B', 'A'], freq=2),\n",
       " FreqItemset(items=['F', 'B', 'C'], freq=2),\n",
       " FreqItemset(items=['F', 'B', 'C', 'A'], freq=2),\n",
       " FreqItemset(items=['F', 'B', 'E'], freq=2),\n",
       " FreqItemset(items=['F', 'B', 'E', 'A'], freq=2),\n",
       " FreqItemset(items=['F', 'B', 'E', 'C'], freq=2),\n",
       " FreqItemset(items=['F', 'B', 'E', 'C', 'A'], freq=2),\n",
       " FreqItemset(items=['F', 'C'], freq=2),\n",
       " FreqItemset(items=['F', 'C', 'A'], freq=2),\n",
       " FreqItemset(items=['F', 'E'], freq=2),\n",
       " FreqItemset(items=['F', 'E', 'A'], freq=2),\n",
       " FreqItemset(items=['F', 'E', 'C'], freq=2),\n",
       " FreqItemset(items=['F', 'E', 'C', 'A'], freq=2),\n",
       " FreqItemset(items=['G'], freq=5),\n",
       " FreqItemset(items=['G', 'A'], freq=5),\n",
       " FreqItemset(items=['G', 'C'], freq=5),\n",
       " FreqItemset(items=['G', 'C', 'A'], freq=5),\n",
       " FreqItemset(items=['G', 'E'], freq=4),\n",
       " FreqItemset(items=['G', 'E', 'A'], freq=4),\n",
       " FreqItemset(items=['G', 'E', 'C'], freq=4),\n",
       " FreqItemset(items=['G', 'E', 'C', 'A'], freq=4)]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect every frequent itemset to the driver, then sort for a stable display order.\n",
    "freq_itemsets = model.freqItemsets().collect()\n",
    "sorted(freq_itemsets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.mllib.fpm import PrefixSpan\n",
    "\n",
    "# Four toy sequences; each sequence is an ordered list of itemsets.\n",
    "data = [\n",
    "    [[\"a\"], [\"a\", \"b\", \"c\"], [\"a\", \"c\"], [\"d\"], [\"c\", \"f\"]],\n",
    "    [[\"a\", \"d\"], [\"c\"], [\"b\", \"c\"], [\"a\", \"e\"]],\n",
    "    [[\"e\", \"f\"], [\"a\", \"b\"], [\"d\", \"f\"], [\"c\"], [\"b\"]],\n",
    "    [[\"e\"], [\"g\"], [\"a\", \"f\"], [\"c\"], [\"b\"], [\"c\"]],\n",
    "]\n",
    "rdd = sc.parallelize(data, numSlices=2)\n",
    "# Minimum support of 50%, maximum pattern length of 4.\n",
    "model = PrefixSpan.train(rdd, minSupport=0.5, maxPatternLength=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[FreqSequence(sequence=[['a']], freq=4),\n",
       " FreqSequence(sequence=[['a'], ['a']], freq=2),\n",
       " FreqSequence(sequence=[['a'], ['b']], freq=4),\n",
       " FreqSequence(sequence=[['a'], ['b'], ['a']], freq=2),\n",
       " FreqSequence(sequence=[['a'], ['b'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['a'], ['b', 'c']], freq=2),\n",
       " FreqSequence(sequence=[['a'], ['b', 'c'], ['a']], freq=2),\n",
       " FreqSequence(sequence=[['a'], ['c']], freq=4),\n",
       " FreqSequence(sequence=[['a'], ['c'], ['a']], freq=2),\n",
       " FreqSequence(sequence=[['a'], ['c'], ['b']], freq=3),\n",
       " FreqSequence(sequence=[['a'], ['c'], ['c']], freq=3),\n",
       " FreqSequence(sequence=[['a'], ['d']], freq=2),\n",
       " FreqSequence(sequence=[['a'], ['d'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['a'], ['f']], freq=2),\n",
       " FreqSequence(sequence=[['b']], freq=4),\n",
       " FreqSequence(sequence=[['b'], ['a']], freq=2),\n",
       " FreqSequence(sequence=[['b'], ['c']], freq=3),\n",
       " FreqSequence(sequence=[['b'], ['d']], freq=2),\n",
       " FreqSequence(sequence=[['b'], ['d'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['b'], ['f']], freq=2),\n",
       " FreqSequence(sequence=[['b', 'a']], freq=2),\n",
       " FreqSequence(sequence=[['b', 'a'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['b', 'a'], ['d']], freq=2),\n",
       " FreqSequence(sequence=[['b', 'a'], ['d'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['b', 'a'], ['f']], freq=2),\n",
       " FreqSequence(sequence=[['b', 'c']], freq=2),\n",
       " FreqSequence(sequence=[['b', 'c'], ['a']], freq=2),\n",
       " FreqSequence(sequence=[['c']], freq=4),\n",
       " FreqSequence(sequence=[['c'], ['a']], freq=2),\n",
       " FreqSequence(sequence=[['c'], ['b']], freq=3),\n",
       " FreqSequence(sequence=[['c'], ['c']], freq=3),\n",
       " FreqSequence(sequence=[['d']], freq=3),\n",
       " FreqSequence(sequence=[['d'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['d'], ['c']], freq=3),\n",
       " FreqSequence(sequence=[['d'], ['c'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['e']], freq=3),\n",
       " FreqSequence(sequence=[['e'], ['a']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['a'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['a'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['a'], ['c'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['b'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['c'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['f']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['f'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['f'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['e'], ['f'], ['c'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['f']], freq=3),\n",
       " FreqSequence(sequence=[['f'], ['b']], freq=2),\n",
       " FreqSequence(sequence=[['f'], ['b'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['f'], ['c']], freq=2),\n",
       " FreqSequence(sequence=[['f'], ['c'], ['b']], freq=2)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect every frequent sequence to the driver, then sort for a stable display order.\n",
    "freq_sequences = model.freqSequences().collect()\n",
    "sorted(freq_sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
